library(tidyverse)
library(cowplot)
library(randomForest)
library(pROC)
library(pheatmap)
library(epitools)
library(glmnet)
library(nlme)
library(ggforce)
library(gggenes)
library(viridis)
patient_data<-read.csv("sample_and_patient_data.csv") %>% mutate(MCoVNumber=str_remove(mcov_id, "-")) %>% mutate(collection_date=as.Date(COLLECTION_DT, "%m/%d/%y")) %>% mutate(collection_month=format(as.Date(collection_date), "%Y-%m")) %>%
mutate(CT=ifelse(INSTRUMENT_RESULT<50, INSTRUMENT_RESULT, NA_integer_)) %>% mutate(vaccine_status=if_else(Vaccine_Status=="No vaccine"|Vaccine_Status==">7 days past 1st Vaccine",0,1)) %>%
mutate(age18under=if_else(Age_Group=="00-17",1,0)) %>% mutate(age18to54=if_else(Age_Group=="18-54",1,0)) %>% mutate(age55plus=if_else(Age_Group=="55-64"|Age_Group=="65+",1,0)) %>%
select(MCoVNumber, collection_date, collection_month, run=run_group, CT, ordering_clinic=ORDERING_CLINIC_TYPE,pui=PUI, age18under, age18to54, age55plus, sex=SEX, ethnicity=Ethnicity, obesity=Obesity_YN, chronic_lung_disease=Chronic_Lung_Disease_YN, chronic_liver_disease=Chronic_Liver_Disease_YN, surveillance_sample=IS_SURVEILLANCE, chronic_heart_disease=Chronic_Heart_Disease_YN,chronic_kidney_disease=Chronic_Kidney_Disease_YN, hypertension=Hypertension_YN, diabetes=Diabetes_YN, cancer=Cancer_YN, hiv=HIV_YN, transplant_patient=Transplant_Patient, vaccine_status, admitted_hospital=Admitted_YN, highest_level=HIGHEST_LEVEL_OF_CARE, max_respiratory_support=MaxRespiratorySupport, mAb=mAb_YN, plasma=Plasma_YN)
factor_columns<- c("collection_month","run","ordering_clinic","pui","age18under","age18to54", "age55plus","sex","ethnicity","obesity","surveillance_sample", "chronic_lung_disease","chronic_liver_disease","chronic_heart_disease","chronic_kidney_disease","hypertension","diabetes","cancer","hiv","transplant_patient","vaccine_status","admitted_hospital","highest_level","max_respiratory_support","mAb","plasma")
patient_data[factor_columns]<-lapply(patient_data[factor_columns], factor)
#load file and tally minor variant richness
minor_variant_sites_threshold<-read.csv('minor_variants_filtered_200x0.02_100.csv')
mcov_samples_filtered<-mcov_samples %>% filter(!run %in% runs_to_drop) %>%
filter(qc_status=="pass") %>% filter(!MCoVNumber %in% nextclade_bad_samples) %>% filter(scorpio_call!="Omicron (BA.1-like)") %>%
##### #main coverage criterion for fair comparisons: X depth over Y percent of the genome
filter(fraction_200x_coverage>=0.98) %>% droplevels()
n_var<-minor_variant_sites_threshold %>% group_by(MCoVNumber) %>% tally()
samples_n_var<-mcov_samples_filtered %>% left_join(n_var) %>% arrange(COLLECTION_DT) %>% mutate(n_var=replace_na(n, 0))
Joining, by = "MCoVNumber"
for_patient_analysis<-samples_n_var %>% filter(INSTRUMENT_RESULT<26) %>% select(MCoVNumber, n_var, scorpio_call)
for_patient_analysis %>% pull(n_var) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 1.000 2.722 3.000 140.000
for_patient_analysis %>% nrow()
[1] 5798
median_var<-for_patient_analysis %>% pull(n_var) %>% median()
#add patient metadata to sample minor variant richness
p<-left_join(for_patient_analysis, patient_data) %>% droplevels() %>% mutate(vocAlpha=if_else(startsWith(scorpio_call, "Alpha"),1,0), vocDelta=if_else(startsWith(scorpio_call, "Delta"),1,0)) %>% mutate(vocAlpha=as.factor(vocAlpha), vocDelta=as.factor(vocDelta))
Joining, by = "MCoVNumber"
#Random Forest model of patient characteristics for classifying low-CT samples as having high or low minor variant richness
p <- p %>% mutate(var_level=if_else(n_var<=median_var, "low","high")) %>% mutate(var_level=as.factor(var_level))
p_select <- p %>% select(age18under, age18to54, age55plus, sex, ethnicity, chronic_lung_disease, chronic_liver_disease, chronic_kidney_disease, chronic_heart_disease, hypertension, diabetes, cancer, obesity, transplant_patient, plasma, mAb, vaccine_status, admitted_hospital,surveillance_sample, var_level)
p.rf<-randomForest(var_level~.,
data=p_select,
ntree=1000,
mtry=5,
importance = T)
importance.randomForest <- as.data.frame(randomForest::importance(p.rf))
importance.randomForest<-importance.randomForest %>% arrange(desc(MeanDecreaseAccuracy))
importance.randomForest
rf.roc<-roc(p_select$var_level,p.rf$votes[,2], levels=c(case="high",control="low"))
Setting direction: controls < cases
auc(rf.roc)
Area under the curve: 0.5961
categories<-p %>% mutate(minor_greater_0=if_else(n_var>0,'yes','no')) %>% mutate(minor_greater_5=if_else(n_var>5,'yes','no')) %>% mutate(minor_greater_10=if_else(n_var>10,'yes','no'))
categories1<-categories %>% group_by(admitted_hospital, minor_greater_0) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_0)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
categories2<-categories %>% group_by(admitted_hospital, minor_greater_5) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_5)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
categories3<-categories %>% group_by(admitted_hospital, minor_greater_10) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_10)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
plot_grid(categories1, categories2, categories3, nrow=3)
hospitalization<-table(p$admitted_hospital,p$var_level)
hospitalization
high low
0 1302 2379
1 1090 1027
chisq.test(hospitalization)
Pearson's Chi-squared test with Yates' continuity correction
data: hospitalization
X-squared = 143.39, df = 1, p-value < 2.2e-16
oddsratio(hospitalization, rev="columns")
$data
low high Total
0 2379 1302 3681
1 1027 1090 2117
Total 3406 2392 5798
$measure
odds ratio with 95% C.I.
estimate lower upper
0 1.000000 NA NA
1 1.938966 1.739307 2.162093
$p.value
two-sided
midp.exact fisher.exact chi.square
0 NA NA NA
1 0 6.637658e-33 3.452345e-33
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
#linear mixed-effect model with sequencing run as random effect
lme(log10(n_var+1) ~ CT*admitted_hospital, random=~1|run,
data=p) %>% anova()
hosp_altdata1<-p %>% ggplot(aes(x=CT, y=log10(n_var+1), color=admitted_hospital)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","darkred")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=10, y=2, label="Ct p<0.0001 \nhospitalization p<0.0001 \nCt*hospitalization p<0.0001") + theme(legend.position="bottom") + ylim(0,2.6)
#when did most cases among vaccinated patients occur?
table(p$collection_month, p$vaccine_status)
0 1
2020-12 1478 0
2021-01 883 0
2021-02 351 1
2021-03 379 7
2021-04 265 12
2021-05 180 16
2021-06 68 10
2021-07 302 122
2021-08 873 370
2021-09 149 93
2021-10 71 48
2021-11 62 58
vax_status_subset<- samples_n_var %>% filter(INSTRUMENT_RESULT<35) %>% select(MCoVNumber, n_var, scorpio_call) %>% left_join(patient_data) %>%
filter(collection_date>="2021-07-01") %>% filter(admitted_hospital==0)
Joining, by = "MCoVNumber"
lme(log10(n_var+1) ~ CT*vaccine_status, random=~1|run,
data=vax_status_subset) %>% anova()
NA
#to see effect of vaccination (as a correlate of disease severity): limit to later than July and only non-hospitalized patients
vax_altdata1<-vax_status_subset %>% ggplot(aes(x=CT, y=log10(n_var+1), color=vaccine_status)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","lightblue")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=14, y=2, label="Ct p<0.0001 \nvaccination p=0.0119 \nCt*vaccination p=0.055") + theme(legend.position="bottom") + ylim(0,2.6)
#healthcare worker surveillance (presumed mostly asymptomatic) vs non-hospitalized patients (presumed mostly symptomatic)
surv_subset <- samples_n_var %>% filter(INSTRUMENT_RESULT<35) %>% select(MCoVNumber, n_var, scorpio_call) %>% left_join(patient_data) %>%
filter(admitted_hospital==0) %>% filter(!(surveillance_sample==1 & pui=="PUI")) #exclude HCW who were patients
Joining, by = "MCoVNumber"
lme(log10(n_var+1) ~ CT*surveillance_sample, random=~1|run,
data=surv_subset) %>% anova()
hcw_altdata1<-surv_subset %>% ggplot(aes(x=CT, y=log10(n_var+1), color=surveillance_sample)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","darkgreen")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=12, y=2, label="Ct p<0.0001 \nHCW surveillance p=0.007 \nCt*surveillance p=0.0076")+ theme(legend.position="bottom") + ylim(0,2.6)
#LASSO regression model including all sample and patient characteristics to explain minor variant richness
p_sub<- p %>% left_join(mcov_samples) #to add info about coverage
Joining, by = c("MCoVNumber", "scorpio_call", "run")
y<-log10(p_sub$n_var+1)
x<-data.matrix(p_sub[, c('age18under','age55plus','sex','chronic_lung_disease', 'chronic_liver_disease', 'chronic_kidney_disease', 'chronic_heart_disease', 'hypertension', 'diabetes', 'cancer', 'obesity', 'plasma', 'mAb', 'admitted_hospital','vaccine_status','vocAlpha','vocDelta','collection_month','surveillance_sample','CT','median_coverage','run')])
cv_model <- cv.glmnet(x, y, alpha = 1, nfolds=100)
plot(cv_model)
best_model <- glmnet(x, y, alpha = 1, lambda = cv_model$lambda.min)
best_model$dev.ratio
[1] 0.1432548
lasso1_altdata1<-coef(best_model) %>% as.matrix() %>% data.frame() %>% rownames_to_column() %>% rename(factor=rowname, coefficient=s0) %>% filter(factor!="(Intercept)") %>% arrange(desc(coefficient)) %>% ggplot(aes(x=coefficient, y=fct_reorder(factor,coefficient))) + geom_bar(stat="identity", fill="#E2D200", color="gray") + theme_bw() + ylab("Factor") + xlab("Coefficient")
#same model excluding any factors with a temporal signal (VOC, vaccination, collection month, run)
y2<-log10(p_sub$n_var+1)
x2<-data.matrix(p_sub[, c('age18under','age55plus','sex','chronic_lung_disease', 'chronic_liver_disease', 'chronic_kidney_disease', 'chronic_heart_disease', 'hypertension', 'diabetes', 'cancer', 'obesity', 'plasma', 'mAb', 'admitted_hospital', 'surveillance_sample', 'CT', 'median_coverage')])
cv_model_2 <- cv.glmnet(x2, y2, alpha = 1, nfolds=100)
plot(cv_model_2)
best_model_2 <- glmnet(x2, y2, alpha = 1, lambda = cv_model$lambda.min)
best_model_2$dev.ratio
[1] 0.1144788
lasso2_altdata1<-coef(best_model_2) %>% as.matrix() %>% data.frame() %>% rownames_to_column() %>% rename(factor=rowname, coefficient=s0) %>% filter(factor!="(Intercept)") %>% arrange(desc(coefficient)) %>% ggplot(aes(x=coefficient, y=fct_reorder(factor,coefficient))) + geom_bar(stat="identity", fill="#E2D200", color="black") + theme_bw() + ylab("Factor") + xlab("Coefficient")
title <- ggdraw() +
draw_label(
"Alternate dataset 1",
fontface = 'bold',
x = 0,
hjust = 0)
all_plots_altdata1<-plot_grid(plot_grid(hosp_altdata1, vax_altdata1, hcw_altdata1, ncol=3), plot_grid(lasso1_altdata1, lasso2_altdata1, ncol=2), nrow=2, rel_heights = c(2,1))
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 3 rows containing missing values (geom_smooth).
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 10 rows containing missing values (geom_smooth).
alt_1<-plot_grid(title, all_plots_altdata1, nrow=2, rel_heights=c(0.1,1))
#load file and tally minor variant richness
minor_variant_sites_threshold<-read.csv('minor_variants_filtered_500x0.03_20.csv')
mcov_samples_filtered<-mcov_samples %>% filter(!run %in% runs_to_drop) %>%
filter(qc_status=="pass") %>% filter(!MCoVNumber %in% nextclade_bad_samples) %>% filter(scorpio_call!="Omicron (BA.1-like)") %>%
##### #main coverage criterion for fair comparisons: X depth over Y percent of the genome
filter(fraction_500x_coverage>=0.98) %>% droplevels()
n_var<-minor_variant_sites_threshold %>% group_by(MCoVNumber) %>% tally()
samples_n_var<-mcov_samples_filtered %>% left_join(n_var) %>% arrange(COLLECTION_DT) %>% mutate(n_var=replace_na(n, 0))
Joining, by = "MCoVNumber"
for_patient_analysis<-samples_n_var %>% filter(INSTRUMENT_RESULT<26) %>% select(MCoVNumber, n_var, scorpio_call)
for_patient_analysis %>% pull(n_var) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 1.000 1.621 2.000 99.000
for_patient_analysis %>% nrow()
[1] 4869
median_var<-for_patient_analysis %>% pull(n_var) %>% median()
#add patient metadata to sample minor variant richness
p<-left_join(for_patient_analysis, patient_data) %>% droplevels() %>% mutate(vocAlpha=if_else(startsWith(scorpio_call, "Alpha"),1,0), vocDelta=if_else(startsWith(scorpio_call, "Delta"),1,0)) %>% mutate(vocAlpha=as.factor(vocAlpha), vocDelta=as.factor(vocDelta))
Joining, by = "MCoVNumber"
#Random Forest model of patient characteristics for classifying low-CT samples as having high or low minor variant richness
p <- p %>% mutate(var_level=if_else(n_var<=median_var, "low","high")) %>% mutate(var_level=as.factor(var_level))
p_select <- p %>% select(age18under, age18to54, age55plus, sex, ethnicity, chronic_lung_disease, chronic_liver_disease, chronic_kidney_disease, chronic_heart_disease, hypertension, diabetes, cancer, obesity, transplant_patient, plasma, mAb, vaccine_status, admitted_hospital,surveillance_sample, var_level)
p.rf<-randomForest(var_level~.,
data=p_select,
ntree=1000,
mtry=5,
importance = T)
importance.randomForest <- as.data.frame(randomForest::importance(p.rf))
importance.randomForest<-importance.randomForest %>% arrange(desc(MeanDecreaseAccuracy))
importance.randomForest
rf.roc<-roc(p_select$var_level,p.rf$votes[,2], levels=c(case="high",control="low"))
Setting direction: controls < cases
auc(rf.roc)
Area under the curve: 0.583
categories<-p %>% mutate(minor_greater_0=if_else(n_var>0,'yes','no')) %>% mutate(minor_greater_5=if_else(n_var>5,'yes','no')) %>% mutate(minor_greater_10=if_else(n_var>10,'yes','no'))
categories1<-categories %>% group_by(admitted_hospital, minor_greater_0) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_0)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
categories2<-categories %>% group_by(admitted_hospital, minor_greater_5) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_5)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
categories3<-categories %>% group_by(admitted_hospital, minor_greater_10) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_10)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
plot_grid(categories1, categories2, categories3, nrow=3)
hospitalization<-table(p$admitted_hospital,p$var_level)
hospitalization
high low
0 754 2428
1 640 1047
chisq.test(hospitalization)
Pearson's Chi-squared test with Yates' continuity correction
data: hospitalization
X-squared = 108.74, df = 1, p-value < 2.2e-16
oddsratio(hospitalization, rev="columns")
$data
low high Total
0 2428 754 3182
1 1047 640 1687
Total 3475 1394 4869
$measure
odds ratio with 95% C.I.
estimate lower upper
0 1.000000 NA NA
1 1.968102 1.731812 2.23668
$p.value
two-sided
midp.exact fisher.exact chi.square
0 NA NA NA
1 0 4.812526e-25 1.305533e-25
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
#linear mixed-effect model with sequencing run as random effect
lme(log10(n_var+1) ~ CT*admitted_hospital, random=~1|run,
data=p) %>% anova()
hosp_altdata2<-p %>% ggplot(aes(x=CT, y=log10(n_var+1), color=admitted_hospital)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","darkred")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=10, y=1.75, label="Ct p<0.0001 \nhospitalization p<0.0001 \nCt*hospitalization p=0.0059") + theme(legend.position="bottom") + ylim(0,2.6)
hosp_altdata2
`geom_smooth()` using formula 'y ~ x'
vax_status_subset<- samples_n_var %>% filter(INSTRUMENT_RESULT<35) %>% select(MCoVNumber, n_var, scorpio_call) %>% left_join(patient_data) %>%
filter(collection_date>="2021-07-01") %>% filter(admitted_hospital==0)
Joining, by = "MCoVNumber"
lme(log10(n_var+1) ~ CT*vaccine_status, random=~1|run,
data=vax_status_subset) %>% anova()
#to see effect of vaccination (as a correlate of disease severity): limit to later than July and only non-hospitalized patients
vax_altdata2<-vax_status_subset %>% ggplot(aes(x=CT, y=log10(n_var+1), color=vaccine_status)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","lightblue")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=14, y=1.75, label="Ct p<0.0001 \nvaccination p=0.0671 \nCt*vaccination p=0.0226") + theme(legend.position="bottom") + ylim(0,2.6)
vax_altdata2
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 5 rows containing missing values (geom_smooth).
#healthcare worker surveillance (presumed mostly asymptomatic) vs non-hospitalized patients (presumed mostly symptomatic)
surv_subset <- samples_n_var %>% filter(INSTRUMENT_RESULT<35) %>% select(MCoVNumber, n_var, scorpio_call) %>% left_join(patient_data) %>%
filter(admitted_hospital==0) %>% filter(!(surveillance_sample==1 & pui=="PUI")) #exclude HCW who were patients
Joining, by = "MCoVNumber"
lme(log10(n_var+1) ~ CT*surveillance_sample, random=~1|run,
data=surv_subset) %>% anova()
hcw_altdata2<-surv_subset %>% ggplot(aes(x=CT, y=log10(n_var+1), color=surveillance_sample)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","darkgreen")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=12, y=1.75, label="Ct p<0.0001 \nHCW surveillance p=0.0077 \nCt*surveillance p=0.0001")+ theme(legend.position="bottom") + ylim(0,2.6)
hcw_altdata2
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 11 rows containing missing values (geom_smooth).
#LASSO regression model including all sample and patient characteristics to explain minor variant richness
p_sub<- p %>% left_join(mcov_samples) #to add info about coverage
Joining, by = c("MCoVNumber", "scorpio_call", "run")
y<-log10(p_sub$n_var+1)
x<-data.matrix(p_sub[, c('age18under','age55plus','sex','chronic_lung_disease', 'chronic_liver_disease', 'chronic_kidney_disease', 'chronic_heart_disease', 'hypertension', 'diabetes', 'cancer', 'obesity', 'plasma', 'mAb', 'admitted_hospital','vaccine_status','vocAlpha','vocDelta','collection_month','surveillance_sample','CT','median_coverage','run')])
cv_model <- cv.glmnet(x, y, alpha = 1, nfolds=100)
plot(cv_model)
best_model <- glmnet(x, y, alpha = 1, lambda = cv_model$lambda.min)
best_model$dev.ratio
[1] 0.1014136
lasso1_altdata2<-coef(best_model) %>% as.matrix() %>% data.frame() %>% rownames_to_column() %>% rename(factor=rowname, coefficient=s0) %>% filter(factor!="(Intercept)") %>% arrange(desc(coefficient)) %>% ggplot(aes(x=coefficient, y=fct_reorder(factor,coefficient))) + geom_bar(stat="identity", fill="#E2D200", color="gray") + theme_bw() + ylab("Factor") + xlab("Coefficient")
#same model excluding any factors with a temporal signal (VOC, vaccination, collection month, run)
y2<-log10(p_sub$n_var+1)
x2<-data.matrix(p_sub[, c('age18under','age55plus','sex','chronic_lung_disease', 'chronic_liver_disease', 'chronic_kidney_disease', 'chronic_heart_disease', 'hypertension', 'diabetes', 'cancer', 'obesity', 'plasma', 'mAb', 'admitted_hospital', 'surveillance_sample', 'CT', 'median_coverage')])
cv_model_2 <- cv.glmnet(x2, y2, alpha = 1, nfolds=100)
plot(cv_model_2)
best_model_2 <- glmnet(x2, y2, alpha = 1, lambda = cv_model$lambda.min)
best_model_2$dev.ratio
[1] 0.07680036
lasso2_altdata2<-coef(best_model_2) %>% as.matrix() %>% data.frame() %>% rownames_to_column() %>% rename(factor=rowname, coefficient=s0) %>% filter(factor!="(Intercept)") %>% arrange(desc(coefficient)) %>% ggplot(aes(x=coefficient, y=fct_reorder(factor,coefficient))) + geom_bar(stat="identity", fill="#E2D200", color="black") + theme_bw() + ylab("Factor") + xlab("Coefficient")
title <- ggdraw() +
draw_label(
"Alternate dataset 2",
fontface = 'bold',
x = 0,
hjust = 0)
all_plots_altdata2<-plot_grid(plot_grid(hosp_altdata2, vax_altdata2, hcw_altdata2, ncol=3), plot_grid(lasso1_altdata2, lasso2_altdata2, ncol=2), nrow=2, rel_heights = c(2,1))
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 5 rows containing missing values (geom_smooth).
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 11 rows containing missing values (geom_smooth).
alt_2<-plot_grid(title, all_plots_altdata2, nrow=2, rel_heights=c(0.1,1))
alt_2
#load file and tally minor variant richness
minor_variant_sites_threshold<-read.csv('minor_variants_filtered_1000x0.01.csv')
mcov_samples_filtered<-mcov_samples %>% filter(!run %in% runs_to_drop) %>%
filter(qc_status=="pass") %>% filter(!MCoVNumber %in% nextclade_bad_samples) %>% filter(scorpio_call!="Omicron (BA.1-like)") %>%
##### #main coverage criterion for fair comparisons: X depth over Y percent of the genome
filter(fraction_1000x_coverage>=0.98) %>% droplevels()
n_var<-minor_variant_sites_threshold %>% group_by(MCoVNumber) %>% tally()
samples_n_var<-mcov_samples_filtered %>% left_join(n_var) %>% arrange(COLLECTION_DT) %>% mutate(n_var=replace_na(n, 0))
Joining, by = "MCoVNumber"
for_patient_analysis<-samples_n_var %>% filter(INSTRUMENT_RESULT<26) %>% select(MCoVNumber, n_var, scorpio_call)
for_patient_analysis %>% pull(n_var) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 3.000 6.000 8.193 10.000 275.000
for_patient_analysis %>% nrow()
[1] 3486
median_var<-for_patient_analysis %>% pull(n_var) %>% median()
#add patient metadata to sample minor variant richness
p<-left_join(for_patient_analysis, patient_data) %>% droplevels() %>% mutate(vocAlpha=if_else(startsWith(scorpio_call, "Alpha"),1,0), vocDelta=if_else(startsWith(scorpio_call, "Delta"),1,0)) %>% mutate(vocAlpha=as.factor(vocAlpha), vocDelta=as.factor(vocDelta))
Joining, by = "MCoVNumber"
#Random Forest model of patient characteristics for classifying low-CT samples as having high or low minor variant richness
p <- p %>% mutate(var_level=if_else(n_var<=median_var, "low","high")) %>% mutate(var_level=as.factor(var_level))
p_select <- p %>% select(age18under, age18to54, age55plus, sex, ethnicity, chronic_lung_disease, chronic_liver_disease, chronic_kidney_disease, chronic_heart_disease, hypertension, diabetes, cancer, obesity, transplant_patient, plasma, mAb, vaccine_status, admitted_hospital,surveillance_sample, var_level)
p.rf<-randomForest(var_level~.,
data=p_select,
ntree=1000,
mtry=5,
importance = T)
importance.randomForest <- as.data.frame(randomForest::importance(p.rf))
importance.randomForest<-importance.randomForest %>% arrange(desc(MeanDecreaseAccuracy))
importance.randomForest
rf.roc<-roc(p_select$var_level,p.rf$votes[,2], levels=c(case="high",control="low"))
Setting direction: controls < cases
auc(rf.roc)
Area under the curve: 0.5994
categories<-p %>% mutate(minor_greater_0=if_else(n_var>0,'yes','no')) %>% mutate(minor_greater_5=if_else(n_var>5,'yes','no')) %>% mutate(minor_greater_10=if_else(n_var>10,'yes','no'))
categories1<-categories %>% group_by(admitted_hospital, minor_greater_0) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_0)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
categories2<-categories %>% group_by(admitted_hospital, minor_greater_5) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_5)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
categories3<-categories %>% group_by(admitted_hospital, minor_greater_10) %>% tally() %>% ggplot(aes(x=admitted_hospital, y=n, fill=minor_greater_10)) + geom_bar(stat="identity") + scale_fill_manual(values=c("gray","black")) + theme_bw()
plot_grid(categories1, categories2, categories3, nrow=3)
hospitalization<-table(p$admitted_hospital,p$var_level)
hospitalization
high low
0 847 1478
1 640 521
chisq.test(hospitalization)
Pearson's Chi-squared test with Yates' continuity correction
data: hospitalization
X-squared = 109.87, df = 1, p-value < 2.2e-16
oddsratio(hospitalization, rev="columns")
$data
low high Total
0 1478 847 2325
1 521 640 1161
Total 1999 1487 3486
$measure
odds ratio with 95% C.I.
estimate lower upper
0 1.00000 NA NA
1 2.14292 1.85737 2.47349
$p.value
two-sided
midp.exact fisher.exact chi.square
0 NA NA NA
1 0 1.182952e-25 7.106677e-26
$correction
[1] FALSE
attr(,"method")
[1] "median-unbiased estimate & mid-p exact CI"
#linear mixed-effect model with sequencing run as random effect
lme(log10(n_var+1) ~ CT*admitted_hospital, random=~1|run,
data=p) %>% anova()
hosp_altdata3<-p %>% ggplot(aes(x=CT, y=log10(n_var+1), color=admitted_hospital)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","darkred")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=10, y=2, label="Ct p<0.0001 \nhospitalization p<0.0001 \nCt*hospitalization p<0.0001") + theme(legend.position="bottom") + ylim(0,2.6)
hosp_altdata3
`geom_smooth()` using formula 'y ~ x'
vax_status_subset<- samples_n_var %>% filter(INSTRUMENT_RESULT<35) %>% select(MCoVNumber, n_var, scorpio_call) %>% left_join(patient_data) %>%
filter(collection_date>="2021-07-01") %>% filter(admitted_hospital==0)
Joining, by = "MCoVNumber"
lme(log10(n_var+1) ~ CT*vaccine_status, random=~1|run,
data=vax_status_subset) %>% anova()
NA
#to see effect of vaccination (as a correlate of disease severity): limit to later than July and only non-hospitalized patients
vax_altdata3<-vax_status_subset %>% ggplot(aes(x=CT, y=log10(n_var+1), color=vaccine_status)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","lightblue")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=14, y=2, label="Ct p<0.0001 \nvaccination p=0.1098 \nCt*vaccination p=0.56") + theme(legend.position="bottom") + ylim(0,2.6)
vax_altdata3
`geom_smooth()` using formula 'y ~ x'
#healthcare worker surveillance (presumed mostly asymptomatic) vs non-hospitalized patients (presumed mostly symptomatic)
surv_subset <- samples_n_var %>% filter(INSTRUMENT_RESULT<35) %>% select(MCoVNumber, n_var, scorpio_call) %>% left_join(patient_data) %>%
filter(admitted_hospital==0) %>% filter(!(surveillance_sample==1 & pui=="PUI")) #exclude HCW who were patients
Joining, by = "MCoVNumber"
lme(log10(n_var+1) ~ CT*surveillance_sample, random=~1|run,
data=surv_subset) %>% anova()
hcw_altdata3<-surv_subset %>% ggplot(aes(x=CT, y=log10(n_var+1), color=surveillance_sample)) + geom_point(alpha=0.5) + scale_color_manual(values=c("black","darkgreen")) + geom_smooth(method=lm) + theme_bw() + annotate("text", x=12, y=2, label="Ct p<0.0001 \nHCW surveillance p=0.449 \nCt*surveillance p=0.0019")+ theme(legend.position="bottom") + ylim(0,2.6)
hcw_altdata3
`geom_smooth()` using formula 'y ~ x'
#LASSO regression model including all sample and patient characteristics to explain minor variant richness
p_sub<- p %>% left_join(mcov_samples) #to add info about coverage
Joining, by = c("MCoVNumber", "scorpio_call", "run")
y<-log10(p_sub$n_var+1)
x<-data.matrix(p_sub[, c('age18under','age55plus','sex','chronic_lung_disease', 'chronic_liver_disease', 'chronic_kidney_disease', 'chronic_heart_disease', 'hypertension', 'diabetes', 'cancer', 'obesity', 'plasma', 'mAb', 'admitted_hospital','vaccine_status','vocAlpha','vocDelta','collection_month','surveillance_sample','CT','median_coverage','run')])
cv_model <- cv.glmnet(x, y, alpha = 1, nfolds=100)
plot(cv_model)
best_model <- glmnet(x, y, alpha = 1, lambda = cv_model$lambda.min)
best_model$dev.ratio
[1] 0.2848387
lasso1_altdata3<-coef(best_model) %>% as.matrix() %>% data.frame() %>% rownames_to_column() %>% rename(factor=rowname, coefficient=s0) %>% filter(factor!="(Intercept)") %>% arrange(desc(coefficient)) %>% ggplot(aes(x=coefficient, y=fct_reorder(factor,coefficient))) + geom_bar(stat="identity", fill="#E2D200", color="gray") + theme_bw() + ylab("Factor") + xlab("Coefficient")
#same model excluding any factors with a temporal signal (VOC, vaccination, collection month, run)
y2<-log10(p_sub$n_var+1)
x2<-data.matrix(p_sub[, c('age18under','age55plus','sex','chronic_lung_disease', 'chronic_liver_disease', 'chronic_kidney_disease', 'chronic_heart_disease', 'hypertension', 'diabetes', 'cancer', 'obesity', 'plasma', 'mAb', 'admitted_hospital', 'surveillance_sample', 'CT', 'median_coverage')])
cv_model_2 <- cv.glmnet(x2, y2, alpha = 1, nfolds=100)
plot(cv_model_2)
best_model_2 <- glmnet(x2, y2, alpha = 1, lambda = cv_model$lambda.min)
best_model_2$dev.ratio
[1] 0.1660055
lasso2_altdata3<-coef(best_model_2) %>% as.matrix() %>% data.frame() %>% rownames_to_column() %>% rename(factor=rowname, coefficient=s0) %>% filter(factor!="(Intercept)") %>% arrange(desc(coefficient)) %>% ggplot(aes(x=coefficient, y=fct_reorder(factor,coefficient))) + geom_bar(stat="identity", fill="#E2D200", color="black") + theme_bw() + ylab("Factor") + xlab("Coefficient")
title <- ggdraw() +
draw_label(
"Alternate dataset 3",
fontface = 'bold',
x = 0,
hjust = 0)
all_plots_altdata3<-plot_grid(plot_grid(hosp_altdata3, vax_altdata3, hcw_altdata3, ncol=3), plot_grid(lasso1_altdata3, lasso2_altdata3, ncol=2), nrow=2, rel_heights = c(2,1))
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
alt_3<-plot_grid(title, all_plots_altdata3, nrow=2, rel_heights=c(0.1,1))
alt_3